// Set your root directory
local root "/Set/Root/Here"
cd "`root'/stata/FARS"

**************************************************************************
* The .do file does everything from bringing in raw data to final analysis
* The basic ordering is:
* (1) bring in FARS data, create county by month-of-sample dataset
* (2) bring in company data, create county by month-of-sample dataset
* (3) merge 1:1 and create event study figures
* As explained below, the company data must be brought in manually
* This requires you to go in and open do file again, no big deal
**************************************************************************


*-----------------------------------------------------------*
*------BRING IN COUNTY AND CITY CODES FROM GSA--------------*
*-----------------------------------------------------------*
* Bring in State and County Names
* Downloaded from https://www.gsa.gov/graphics/ogp/FRPP_GLC_UnitedStates.xls
* Looks like same or very similar file is available here: https://www.gsa.gov/portal/content/102761
* At least for counties, these GSA codes are also known as FIPS codes used by Census Bureau
* February 2017, General Services Administration
* Observations in these GSA data are either county or city, e.g. county observations have a blank citycode
* But for each observation these data give only one or the other, not both
* Consequently, I created separate datasets with county and city names because I want both labeled
insheet using "`root'/raw data/GSA/FRPP_GLC_UnitedStates.csv", clear
drop if statename == ""
assert territory == "U"
drop territory
rename state stateabbrev 
drop if citynamecountyname=="UNKNOWN"

* Make a dataset with every county in the United States
* To make the county crosswalk I keep only observations without a citycode
preserve
keep if citycode == .
drop citycode
rename citynamecountyname countyname
destring, replace
compress
sort statecode countycode 
save "`root'/stata/FARS/intermediate/GSAcounties.dta", replace

* Make a dataset with every city in the United States
* For the city crosswalk, I drop observations without a citycode
* There are a large number of cities that are in multiple counties
* As well as a very small number of duplicate citycodes, e.g. NY's 4170 is both "NEW YORK CITY" and "MIDDLE VILLAGE"
restore
drop if citycode==.
drop countycode
rename citynamecountyname cityname
destring, replace
compress 
drop if cityname == "DENALI BOROUGH" & citycode == 0
duplicates drop statename stateabbrev statecode cityname citycode, force
duplicates drop statecode citycode, force
replace cityname = "NEW YORK CITY" if cityname == "MIDDLE VILLAGE" & statename == "NEW YORK"
sort statecode citycode
save "`root'/stata/FARS/intermediate/GSAcities.dta", replace

* Downloaded from NHTSA website ftp://ftp.nhtsa.dot.gov/fars/ during Feb 2017 
* For 2015, I simply downloaded as CSV, unzipped, and add the year to the file name
* For earlier years CSV is not available so i downloaded DBF, unzipped, StatTransferred, and added year to file name
* I'm using only the `accident' file, but will need the `vehicle' and `person' files for driver age and gender, for example
* Year 2016 data likely available in August 2017

* VE_TOTAL - # of vehicles involved in the crash
* PERSONS - # of people involved in the crash
* PEDS - # of pedestrians involved in the crash
* NO_LANES - # of lanes of the road the crash occurred on
* SP_LIMIT – The speed limit at the location of the crash
* FATALS - # of people who died as a result of the crash
* DRUNK_DR - # of alcohol influenced drivers involved in the crash

*---------------------------------------------*
*------BRING IN ACCIDENT DATA FROM FARS-------*
*---------------------------------------------*

* Bring in Raw Data
insheet using "`root'/raw data/FARS/accident2000.csv", clear 
tempfile temp1 tempAccident tempVehicle FARS
save `temp1', replace
forvalues x = 2001(1)2016 {
	insheet using "`root'/raw data/FARS/accident`x'.csv", clear 
	append using `temp1'
	save `temp1', replace
}
 
keep st_case state county city day month year hour day_week fatals drunk_dr ve_total persons peds
rename state statecode
rename county countycode
rename city citycode
destring, replace
compress
drop if countycode==0 | countycode==997 | countycode==999

* Bring in county names
* A tiny number of U.S. counties had no fatal accidents during sample (_merge==2)
* Florida also has in 2000 a county (#25) that for some reason disappeared after 2000 and is not in GSA codes
sort statecode countycode
merge m:1 statecode countycode using "`root'/stata/FARS/intermediate/GSAcounties.dta"
drop if _merge == 2
drop if _merge == 1 & year == 2000 & statecode == 12 & countycode == 25
drop if _merge == 1 & statecode == 2 & countycode == 201
drop if _merge == 1 // A small number of additional counties are also missing
assert _merge == 3
drop _merge

* Bring in city names
* In the FARS data there are a large number of 0's for citycode, presumably for accidents outside cities
* These, as well as other citycodes for which GSA codes aren't listed get `unknown'
sort statecode citycode
merge m:1 statecode citycode using "`root'/stata/FARS/intermediate/GSAcities.dta"  
tab _merge if citycode != 0
replace cityname = "UNKNOWN" if _merge == 1
drop if _merge == 2
drop _merge
sort year st_case
save `tempAccident', replace

*-------------------------------------------------------*
*------BRING IN VEHICLE AND PERSON DATA FROM FARS-------*
*-------------------------------------------------------*
* Downloaded from NHTSA website ftp://ftp.nhtsa.dot.gov/fars/ during Feb 2017
* Kept only the vehicle and person files
* Added the year to each file manually
* In most years data are available as .DBF which i converted to CSV using Stattransfer
* st_case uniquely identifies each accident
* st_case veh_no uniquely identifies each vehicle
* st_case veh_no per_no uniquely identified each person

* Bring in raw vehicle data
* Odd string error with mcarr_id, a variable I'm not using, so i drop it
insheet using "`root'/raw data/FARS/vehicle2000.csv", clear
gen year = 2000
save `temp1', replace
forvalues x = 2001(1)2016 {
	 insheet using "`root'/raw data/FARS/vehicle`x'.csv", clear
	 gen year=`x'
	 capture: drop mcarr_id
	 append using `temp1'
	 save `temp1', replace
}
keep year st_case state veh_no numoccs deaths dr_drink prev_dwi p_crash2

* Label variables
label variable numoccs "Number of Occupants" 
label variable deaths "Number of Fatalities in the Vehicle" 
label variable dr_drink "Indicator variable for whether driver had been drinking" 
label variable prev_dwi "Number of previous DWI convictions within 5 years"
label variable p_crash2 "the critical event that made this crash imminent"

* Keep variable for *any* person having previous dwi
* This is the number of previous DWIs by driver in last 3 years (or 5 years starting 2015)
tab prev_dwi
gen previousDWI = 0
replace previousDWI = 1 if prev_dwi >= 1 & prev_dwi <= 9
replace numoccs = 1 if numoccs >= 97
collapse (sum) previousDWI numoccs, by(year st_case)
sum
sort year st_case
save `tempVehicle', replace

* Bring in person data
* There is more data on alcohol testing that I'm not currently using
insheet using "`root'/raw data/FARS/person2000.csv", clear
gen year = 2000
save `temp1', replace
forvalues x = 2001(1)2016 {
	insheet using "`root'/raw data/FARS/person`x'.csv", clear
	gen year = `x'
	append using `temp1'
	save `temp1', replace
}
keep year st_case state veh_no per_no age sex per_typ drinking inj_sev

* Label person type, then keep drivers only
label variable per_typ "Person Type"
capture: label drop per_typ
label define per_typ 1 "Driver" 2 "Passenger" 3 "Occupant of Parked Motor Vehicle" ///
                      4 "Occupant of Parked Non-Motor Vehicle" 5 "Pedestrian" 6 "Bicylist" ///
					  7 "Other" 8 "Other" 9 "Other" 10 "Other" 19 "Other"  
label value per_typ per_typ
tab per_typ
drop per_typ


* Label other variables
label variable drinking "Indicator for whether alcohol was involved for this person"
replace drinking = 0 if drinking == 8 | drinking == 9 // assign zero for not reported or unknown

* Keep fatalities only
tab inj_sev
keep if inj_sev == 4 | inj_sev == 6
gen deadmales = 0
replace deadmales = 1 if sex == 1
gen deadfemales = 0
replace deadfemales = 1 if sex == 2
gen dead10s = 0
replace dead10s = 1 if age < 20
gen dead20s = 0
replace dead20s = 1 if age >= 20 & age <= 29
gen dead30s = 0
replace dead30s = 1 if age >= 30 & age <= 39
gen dead40s = 0
replace dead40s = 1 if age >= 40 & age <= 49
gen dead50s = 0
replace dead50s = 1 if age >= 50 & age <= 59
gen dead60s = 0
replace dead60s = 1 if age >= 60

* Collapse by crash
collapse (sum) dead*, by(year st_case) 

* Merge with vehicle data
sum
sort year st_case
merge 1:1 year st_case using `tempVehicle'
assert _merge == 3
drop _merge

* Merge with FARS accident data
* Very small <.1% observations without match in accident data, presumably typos
sort year st_case
merge m:1 year st_case using `tempAccident'
assert _merge != 2
keep if _merge == 3
drop _merge
save `FARS'

*-------------------------------------------------*
*--------------COLLAPSE FURTHER-------------------*
*-------------------------------------------------*

use "`root'/raw data/other/cities.dta", clear
sort statename cityname
merge 1:m statename cityname using "`FARS'"
assert _merge != 1

* Create Separate Counts of Fatalities involving Alcohol and At Night
gen fatals_alcohol = 0
replace fatals_alcohol = fatals if drunk_dr > 0

* Examine day-of-week, coding from FARS manual, though I confirm here this matches dates
label variable day_week "Day of Week"
capture: label drop day_week
label define day_week 1 "Sunday" 2 "Monday" 3 "Tuesday" 4 "Wednesday" 5 "Thursday" 6 "Friday" 7 "Saturday" 9 "Unknown"
label value day_week day_week
tab day_week
gen date = mdy(month,day,year)
gen dow = dow(date)
tab dow
drop date dow

* Generate variable measuring fatalities on weekend nights
gen weekendnight = 0
replace weekendnight = 1 if (day_week == 6 | day_week == 7 | day_week == 1) & ((hour >= 20 | hour <= 4) & hour != 99) 
tab weekendnight
gen fatals_weekend = fatals * weekendnight 

* Generate variables for non-alcohol, and non-weekend related fatalities
gen fatals_nonalcohol = 0
replace fatals_nonalcohol = fatals if drunk_dr == 0
gen fatals_nonweekend = 0
replace fatals_nonweekend = fatals if weekendnight == 0
rename fatals fatals_total

* Categories by gender
gen yalc_males = 0
replace yalc_males = deadmales if drunk_dr != 0
gen yalc_females = 0
replace yalc_females = deadfemales if drunk_dr != 0

* Categories by age
gen yalc_age10 = 0
replace yalc_age10 = dead10s if drunk_dr != 0
gen yalc_age20 = 0
replace yalc_age20 = dead20s if drunk_dr != 0
gen yalc_age30 = 0
replace yalc_age30 = dead30s if drunk_dr != 0
gen yalc_age40 = 0
replace yalc_age40 = dead40s if drunk_dr != 0
gen yalc_age50 = 0
replace yalc_age50 = dead50s if drunk_dr != 0
gen yalc_age60 = 0
replace yalc_age60 = dead60s if drunk_dr != 0

* Categories by time-of-day
gen yalc_6am = 0
replace yalc_6am = fatals_alcohol if hour >= 6 & hour < 12
gen yalc_noon = 0
replace yalc_noon = fatals_alcohol if hour >= 12 & hour < 18
gen yalc_6pm = 0
replace yalc_6pm = fatals_alcohol if hour >= 18 & hour < 24
gen yalc_midn = 0
replace yalc_midn = fatals_alcohol if hour >= 0 & hour < 6

* Categories by hour-of-day
foreach x of numlist 0/23 {
	gen yalc_hour`x' = 0
	replace yalc_hour`x' = fatals_alcohol if hour == `x'
}

* Categories by day-of-week
foreach x of numlist 1/7 {
	gen yalc_dow`x' = 0
	replace yalc_dow`x' = fatals_alcohol if day_week == `x'
}

* Categories by day-of-week
foreach x of numlist 1/12 {
	gen yalc_moy`x' = 0
	replace yalc_moy`x' = fatals_alcohol if month == `x'
}

* Save complete dataset for some descriptive analysis
save completeFARS.dta, replace
